This file documents our reanalysis of the dataset used to examine biodiversity and conflict relationships in the Southern Philippines presented in this paper https://doi.org/10.1038/s44185-024-00044-8.\ Here, we consider spatial autocorrelation.
Files needed
##1. "cdh" folder containing the raster files of tree cover, tree density and forest canopy height
##2. "landcover" folder containing the raster files of 1988 and 2020 land cover types
##3. "mobios_tab.csv" or the biodiversity data
##4. "conflict_tab_89-21.csv" or the conflict data
##5. "functions.R" for calculating non-significant autocorrelated average distance
##6. "1988_lc_classmap.csv" 1988 land cover types code
##7. "2020_lc_classmap.csv" 2020 land cover types code
##8. "mindanao_adm_geo.gpkg" Mindanao boundary
Load required packages
## make sure to install all packages including required dependencies prior to use
re.lib <- c("dplyr", "tidyverse", "ggplot2", "patchwork", "GeoThinneR", "ecodist",
"sf", "sp", "spdep", "raster", "MASS", "terra")
new_packages <- re.lib[!(re.lib %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages)
# load packages
invisible(lapply(re.lib, library, character.only = TRUE))
Prepare datasets
## biodiversity data obtained from https://doi.org/10.15468/rtedgk
## conflict data obtained from https://data.humdata.org/dataset/ucdp-data-for-philippines?
## note that only a subset of this dataset (i.e., those from Mindanao and adjacent islands) was used
## read biodiversity data
bio <- read.csv("data/mobios_tab.csv", header = TRUE, sep = ",")
#subset data
bio.sub <- subset(bio, select = c(12, 23, 20, 15, 16))
bio.sub <- bio.sub %>% filter(county!="") #remove rows with no county information
bio.sub <- bio.sub %>% filter(class!="Malacostraca") #exclude this taxon
bio.sub <- bio.sub %>% filter(class!="Bivalvia") #exclude this taxon
bio.sub <- bio.sub %>% filter(class!="Gastropoda")#exclude this taxon
## check province name
## there are rows where the name of the province is uncertain as indicated by "/"
## "Zamboanga del Norte" was changed to "Zamboanga del Norte" prior to import of data
unique(bio.sub$county)
## [1] "Lanao del Norte"
## [2] "South Cotabato"
## [3] "North Cotabato"
## [4] "Camiguin Island"
## [5] "Maguindanao"
## [6] "Agusan del Sur"
## [7] "Bukidnon"
## [8] "Davao Oriental"
## [9] "Davao"
## [10] "Misamis Occidental"
## [11] "Surigao del Sur"
## [12] "Davao del sur"
## [13] "Lanao del Sur"
## [14] "Sarangani"
## [15] "Misamis Oriental"
## [16] "Surigao del Norte"
## [17] "Davao de Oro"
## [18] "Davao del Norte"
## [19] "Zamboanga del Sur"
## [20] "Surigao del Norte/Agusan del Sur"
## [21] "Dinagat Island"
## [22] "Basilan"
## [23] "Tawi-Tawi"
## [24] "Sulu"
## [25] "Sultan Kudarat"
## [26] "Agusan del Norte"
## [27] "Cagayan de Oro"
## [28] "Bukidnon/Camiguin"
## [29] "Bukidnon/Misamis Oriental/Iligan"
## [30] "Zamboanga del Norte"
## [31] "Zamboanga Sibugay"
## [32] "Misamis Occidental/Misamis Oriental/Camiguin/Bukidnon"
## [33] "Zamboanga City"
## [34] "General Santos"
## [35] "North Cotabato/Davao City"
## remove uncertain province names
bio.sub <- bio.sub %>% filter(!grepl('/', county))
head(bio.sub)
## write data
write.csv(bio.sub, "bio.sub.csv")
## read conflict data
con <- read.csv("data/conflict_tab_89-21.csv", header = TRUE, sep = ",")
## subset to get revelant data
con.sub <- subset(con, select = c(1,3,15,18,30,32,33))
head(con.sub)
## export filtered conflic data
write.csv(con.sub, "con.sub.csv")
Address spatial autocorrelation in both biodiversity and conflict data
## this part was not done in the paper, which may affect the analysis
## first, filter species with duplicate coordinates
bio.unique <- bio.sub %>%
group_by(scientificName) %>%
distinct(decimalLongitude, decimalLatitude, .keep_all = TRUE)
## then find the non-significant autocorrelated average distance to thin the data\
## based on the relationship between land cover types
## use the spatialautocorrelation function
## from https://github.com/jorgeassis/spatialAutocorrelation
source("data/functions.R")
## read data
files <- list.files("data/landcover/", pattern = "tif", full.names=TRUE)
raster.files <- lapply(files, raster)
raster.files
## [[1]]
## class : RasterLayer
## dimensions : 736, 903, 664608 (nrow, ncol, ncell)
## resolution : 0.008000454, 0.007996592 (x, y)
## extent : 119.3807, 126.6051, 4.58649, 10.47198 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs
## source : 1988_lc_rast1km.tif
## names : X1988_lc_rast1km
##
##
## [[2]]
## class : RasterLayer
## dimensions : 736, 903, 664608 (nrow, ncol, ncell)
## resolution : 0.008000454, 0.007996592 (x, y)
## extent : 119.3807, 126.6051, 4.58649, 10.47198 (xmin, xmax, ymin, ymax)
## crs : +proj=longlat +datum=WGS84 +no_defs
## source : 2020_lc_rast1km.tif
## names : X2020_lc_rast1km
## make rasters to have common extent
common_extent <- do.call(union, lapply(raster.files, extent))
raster.files <- lapply(raster.files, function(r) {
extend(r, common_extent)
extent(r) <- common_extent
r
})
## check extent
lapply(raster.files, extent)
## [[1]]
## class : Extent
## xmin : 119.3807
## xmax : 126.6051
## ymin : 4.58649
## ymax : 10.47198
##
## [[2]]
## class : Extent
## xmin : 119.3807
## xmax : 126.6051
## ymin : 4.58649
## ymax : 10.47198
## resample raster files to have a uniform dimension/resolution
standard <- raster.files[[1]]
raster.files <- lapply(raster.files, function(r) {
resample(r, standard, method = "bilinear")
})
## check resolution
lapply(raster.files, res)
## [[1]]
## [1] 0.008000454 0.007996592
##
## [[2]]
## [1] 0.008000454 0.007996592
## stack the raster files
lc.data <- stack(raster.files)
## get longitude and latitude columns
bio.coor <- as.data.frame(subset(bio.unique, select = c("decimalLongitude", "decimalLatitude")))
## define the distance class (in km)
autocorrelationClassDistance <- 2
## define the maximum distance (in km)
autocorrelationMaxDistance <- 10
## define the significance level of the test
autocorrelationSignif <- 0.05
distanceUncorr <- data.frame(Predictor=names(lc.data),Distance=NA)
for( i in 1:length(names(lc.data)))
distanceUncorr[i,2] <- spatialAutocorrelation(occurrenceRecords=bio.coor,subset(lc.data,i),
autocorrelationClassDistance,autocorrelationMaxDistance,autocorrelationSignif)
##
##
## More than 1000 occurrence records.
## Using a maximum of 1000 random records.
##
##
## First non-correlated distance: 4 km
##
## More than 1000 occurrence records.
## Using a maximum of 1000 random records.
##
##
## First non-correlated distance: 4 km
meanCorrDistance <- mean(distanceUncorr[,2])
meanCorrDistance
## [1] 4
## thin every species data using the mean distance
## species name
sp.name <- unique(bio.unique$scientificName)
## create an empty list to store the results
thin.data <- list()
## go over each species
for (spp in sp.name) {
bio.spp <- subset(bio.unique, scientificName == spp)
bio.thin <- thin_points(
data = bio.spp,
long_col = "decimalLongitude",
lat_col = "decimalLatitude",
method = "brute_force",
thin_dist = meanCorrDistance, # thinning distance in km
trials = 1, # number of replicates
all_trials = TRUE, # return all trials
seed = 123 # seed for reproducibility
)
## convert results to a data frame
bio.thin.df <- as.data.frame(bio.thin[1])
## add the species name
bio.thin.df$scientificName <- spp
## append the data to the list
thin.data[[spp]] <- bio.thin.df
}
## combine all thinned
bio.thin.df <- do.call(rbind, thin.data)
## write to a csv
write.csv(bio.thin.df, "bio.thin.csv")
## convert thin data to sf object
bio.thin.sf <- st_as_sf(bio.thin.df, coords = c("decimalLongitude", "decimalLatitude"), crs = 4326)
## plot biodiversity data
## read Mindanao admin boundary
ph.poly <- st_read("data/mindanao_adm_geo.gpkg")
## Reading layer `mindanao_adm__single_parts' from data source
## `/home/christian/Downloads/Supplementary_FS3/data/mindanao_adm_geo.gpkg'
## using driver `GPKG'
## Simple feature collection with 1838 features and 3 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 119.0302 ymin: 4.587184 xmax: 126.6051 ymax: 10.47158
## Geodetic CRS: WGS 84
## plot
ggplot() +
geom_sf(data = ph.poly) +
geom_sf(data = bio.thin.sf, color = "blue", size = 2) +
theme_minimal() +
labs(title = "Thinned Biodiversity Data")
## check the number of records (original vs thinned data)
nrow(bio.sub) # original data
## [1] 10038
nrow(bio.thin.df) # thinned data
## [1] 8074
## apply the same apporach to conflict data
con.unique <- con.sub %>%
group_by(PROVINCE) %>%
distinct(longitude, latitude, .keep_all = TRUE)
## get longitude and latitude columns
con.coor <- as.data.frame(subset(con.unique, select = c("longitude", "latitude")))
## define the distance class
autocorrelationClassDistance <- 2
## define the maximum distance (in km)
autocorrelationMaxDistance <- 10
## define the significance level of the test
autocorrelationSignif <- 0.05
distanceUncorr <- data.frame(Predictor=names(lc.data),Distance=NA)
for( i in 1:length(names(lc.data)))
distanceUncorr[i,2] <- spatialAutocorrelation(occurrenceRecords=con.coor,subset(lc.data,i),
autocorrelationClassDistance,autocorrelationMaxDistance,autocorrelationSignif)
##
##
## More than 1000 occurrence records.
## Using a maximum of 1000 random records.
##
##
## First non-correlated distance: 4 km
##
## More than 1000 occurrence records.
## Using a maximum of 1000 random records.
##
##
## First non-correlated distance: 4 km
meanCorrDistance <- mean(distanceUncorr[,2])
meanCorrDistance
## [1] 4
## thin every province data using the mean distance
## use province name
prov.name <- unique(con.unique$PROVINCE)
## create an empty list to store the results
thin.data <- list()
## go over each province
for (prov in prov.name) {
con.prov <- subset(con.unique, PROVINCE == prov)
con.thin <- thin_points(
data = con.prov,
long_col = "longitude",
lat_col = "latitude",
method = "brute_force",
thin_dist = meanCorrDistance, # thinning distance in km
trials = 1, # number of replicates
all_trials = TRUE, # return all trials
seed = 123 # seed for reproducibility
)
## convert results to a data frame
con.thin.df <- as.data.frame(con.thin[1])
## add the province name
con.thin.df$PROVINCE <- prov
## append the data to the list
thin.data[[prov]] <- con.thin.df
}
## combine all thinned
con.thin.df <- do.call(rbind, thin.data)
## write to a csv
write.csv(con.thin.df, "con.thin.csv")
## convert thin data to sf object
con.thin.sf <- st_as_sf(con.thin.df, coords = c("longitude", "latitude"), crs = 4326)
## plot conflict data
## read Mindanao admin boundary
ph.poly <- st_read("data/mindanao_adm_geo.gpkg")
## Reading layer `mindanao_adm__single_parts' from data source
## `/home/christian/Downloads/Supplementary_FS3/data/mindanao_adm_geo.gpkg'
## using driver `GPKG'
## Simple feature collection with 1838 features and 3 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 119.0302 ymin: 4.587184 xmax: 126.6051 ymax: 10.47158
## Geodetic CRS: WGS 84
## plot
ggplot() +
geom_sf(data = ph.poly) +
geom_sf(data = con.thin.sf, color = "red", size = 2) +
theme_minimal() +
labs(title = "Thinned Conflict Data")
## check the number of records (original vs thinned data)
nrow(con.sub) # original data
## [1] 2583
nrow(con.thin.df) # thinned data
## [1] 748
Count recorded species per taxon in each point and within province
## the procedures for conducting species counts in each site were not mentioned in the paper
## according to the paper, analysis was done at the provincial level
## there could be two ways that species counts were done (see image below)
## this part is extremely important, which the authors failed to discuss
## the left panel of the figure shows that species counts were aggregated at the provincial level
## each point (within the province) receives the same value of species count (or species richness)
## the right panel shows that each point has a unique number of species count
## note that these values are crucial for other downstream analyses
## particularly when relationships of species counts
## and distance to the nearest conflict sites are considered
## we generated the values for these two scenarios below
## and test whether any of them would match the results presented in the paper
knitr::include_graphics("images/bioconflict.png")
## count unique records (i.e., species richness) per point
bio.data.sum <- bio.thin.df %>%
group_by(county, class, decimalLatitude, decimalLongitude) %>%
mutate(NumbSp = n_distinct(scientificName))
head(bio.data.sum)
## write to a csv
write.csv(bio.data.sum, "bio.data.sum.csv")
## count species records per province (this lumps all the data within each province,
## thus each point will have a single species richness)
prov.bio.data.sum <- bio.thin.df %>%
group_by(county, class) %>%
mutate(NumbSp = n_distinct(scientificName))
head(prov.bio.data.sum)
## write to a csv
write.csv(prov.bio.data.sum, "prov.data.sum.csv")
Calculate distance of species record to the nearest conflict sites using QGIS
## in QGIS, import the "bio.data.sum.csv" and "con.thin.csv" from R as delimited text files.
## export these files as GeoPackage or Shapefile to convert the data to meters prior to calculation
## use the CRS UTM Zone 51N
knitr::include_graphics("images/qgis_geo.png")
knitr::include_graphics("images/qgis_utm.png")
## import back the newly saved data to QGIS
## get the distance of nearest conflict areas using the "Joint attributes by nearest" plugin
## QGIS > Processing Toolbox > Join attributes by nearest
knitr::include_graphics("images/qgis_join.png")
## distance can be calculated as well without joining the attributes using the "Distance to nearest hub"
## after calculating the distance, a new layer will be created.
## the new layer contains the calculated distance in the attribute table
## this also includes the x and y coordinates of the nearest conflict site
## then export this new layer as a csv file so we can use the data in R for other analysis
## repeat the entire process for provincial data (i.e., prov.bio.data.sum.csv) if necessary
## figure below shows the nearest conflict point to species record (black line)
## notice the remainder of conflict points are not included
knitr::include_graphics("images/hub.png")